{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from bs4 import BeautifulSoup\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import lightgbm as lgb\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "stemmer = WordNetLemmatizer()\n",
    "from scipy import sparse\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import StrMethodFormatter\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_text(review, remove_stopwords=False, Lem=False):\n",
    "    review_text = BeautifulSoup(review, \"html.parser\").get_text()\n",
    "    review_text = re.sub('[^a-zA-Z]',' ', review_text)\n",
    "    review_text = re.sub('\\s+',' ', review_text)\n",
    "    words = review_text.lower().split()\n",
    "    if remove_stopwords:\n",
    "        stops = set(stopwords.words(\"english\"))\n",
    "        words = [w for w in words if not w in stops]\n",
    "    if Lem:\n",
    "        words = [stemmer.lemmatize(w) for w in words] # Lemmatization\n",
    "    review_text = (' '.join([word for word in words]))\n",
    "    return(review_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Train01.csv', encoding='ISO-8859-1')\n",
    "test_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Test01.csv', encoding='ISO-8859-1')\n",
    "FeatureNames = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\FeatureNames.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "test_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]\n",
    "test_other_models = test_other_models[['id','Price_Log_Pred']]\n",
    "\n",
    "train = pd.merge(train_orig, train_other_models, on='id')\n",
    "test = pd.merge(test_orig, test_other_models, on='id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Price_Log'] = np.log10(train['Price']+1)\n",
    "train.hist(column='Price_Log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FeatureNames = FeatureNames['x'].values.tolist()\n",
    "FeatureNames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Synopsis2'] = train['Synopsis'].apply(lambda x: cleaning_text(x,True,False))\n",
    "test['Synopsis2'] = test['Synopsis'].apply(lambda x: cleaning_text(x,True,False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "vectorizer = CountVectorizer(max_features=2000, min_df=3, max_df=0.7, ngram_range=(1,3))\n",
    "vectorizer.fit(train['Synopsis2'])\n",
    "\n",
    "test_tf = vectorizer.transform(test['Synopsis2'])\n",
    "test_tf = sparse.hstack((test_tf,sparse.csr_matrix(np.asmatrix(test[FeatureNames].values))))\n",
    "print(test_tf.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_list = list(train.FOLD_NUM.unique())\n",
    "fold_list.sort()\n",
    "fold_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "param = {\n",
    "    'learning_rate': 0.01,\n",
    "    'max_depth': 10,\n",
    "    'min_data_in_leaf': 5,\n",
    "    'bagging_freq': 2,\n",
    "    'bagging_fraction': 0.8,\n",
    "    'feature_fraction': 0.5,\n",
    "    'boost': 'gbdt',    \n",
    "    'objective': 'regression',\n",
    "    'metric': 'rmse',\n",
    "    'seed': 392\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = vectorizer.transform(temp_train['Synopsis2'])\n",
    "    temp_val_tf = vectorizer.transform(temp_val['Synopsis2'])\n",
    "    \n",
    "    temp_train_tf = sparse.hstack((temp_train_tf,sparse.csr_matrix(np.asmatrix(temp_train[FeatureNames].values))))\n",
    "    temp_val_tf = sparse.hstack((temp_val_tf,sparse.csr_matrix(np.asmatrix(temp_val[FeatureNames].values))))\n",
    "    \n",
    "    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])\n",
    "    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])\n",
    "    \n",
    "    model = lgb.train(param, trn_data, 20000,valid_sets = val_data, verbose_eval=20, early_stopping_rounds = 100)\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)\n",
    "    \n",
    "    print(\"Fold RMSLE = \",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = model.predict(test_tf, num_iteration=model.best_iteration)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + model.predict(test_tf, num_iteration=model.best_iteration)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CV_SCORED_DATA.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\201901001_LGB01_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LGB'] = sub_data_preds\n",
    "test['Price_Log_Pred_LGB'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\201901001_LGB01_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds2 = (10**sub_data_preds) - 1\n",
    "pd.DataFrame(sub_data_preds2).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission['Price'] = sub_data_preds2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission.to_excel('C:\\\\Kaggle\\\\BooksPrice\\\\Submissions\\\\201901001_LGB01_DS.xlsx', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
